DSAN 5100 Hypothesis Testing

Author

Emily Mitchum

Imports

library(stats)
library(ggplot2)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

Read in Clean Data

earnings_df <-read.csv('/Users/Emi/Library/CloudStorage/GoogleDrive-emilymitchum265@gmail.com/Shared drives/DSAN/DSAN 5100/Final Project/vocational_school_employment_outcomes/Cleaned data/Cleaned_quarterly_earnings_data.csv')
head(earnings_df)
        date education_level median_weekly_earnings
1 2019-01-01         ba_only                   1213
2 2019-04-01         ba_only                   1236
3 2019-07-01         ba_only                   1281
4 2019-10-01         ba_only                   1259
5 2020-01-01         ba_only                   1263
6 2020-04-01         ba_only                   1303
tuition_df <- read.csv('/Users/Emi/Library/CloudStorage/GoogleDrive-emilymitchum265@gmail.com/Shared drives/DSAN/DSAN 5100/Final Project/vocational_school_employment_outcomes/Cleaned data/Cleaned_tuition_data.csv')
head(tuition_df)
  Academic.Year average_tuition     degree_granting year_parsed
1       2019-20           15747 Non-Degree Granting  2019-01-01
2       2019-20           25447     Degree Granting  2019-01-01
3       2020-21           26084     Degree Granting  2020-01-01
4       2020-21           15755 Non-Degree Granting  2020-01-01
5       2021-22           26572     Degree Granting  2021-01-01
6       2021-22           15978 Non-Degree Granting  2021-01-01
  public_or_private
1            public
2            public
3            public
4            public
5            public
6            public

ANOVA on Quarterly Median Weekly Earnings Data

# Correct data types before ANOVA
earnings_df <- earnings_df %>%
  mutate(
    year = factor(format(as.Date(date), "%Y")),
    education_level = factor(education_level)
  )

tuition_df <- tuition_df |> mutate(year = factor(format(as.Date(year_parsed), "%Y")))

Visual Analysis of Earnings by Education Level

library(ggplot2)

ggplot(earnings_df, aes(
  x = education_level,
  y = median_weekly_earnings,
  fill = education_level
)) +
  geom_boxplot(alpha = 0.85, outlier.color = "gray40") +
  scale_fill_brewer(palette = "Set2") +   # cleaner, professional palette
  labs(
    title = "Median Weekly Earnings by Education Level",
    x = "Education Level",
    y = "Median Weekly Earnings"
  ) +
  theme_minimal(base_size = 16) +         # larger base text for slides
  theme(
    text = element_text(family = "Helvetica", face = "bold"),
    plot.title = element_text(size = 22, face = "bold", hjust = 0.5),
    axis.title.x = element_text(size = 18, face = "bold"),
    axis.title.y = element_text(size = 18, face = "bold"),
    axis.text.x = element_text(size = 14, face = "bold", angle = 20, hjust = 1),
    axis.text.y = element_text(size = 14, face = "bold"),
    legend.position = "none",             # removes redundant legend
    panel.grid.major.x = element_blank()  # cleaner slide aesthetic
  )

One-Way ANOVA (Ignoring Time)

oneway <- aov(median_weekly_earnings ~ education_level, data = earnings_df)
summary(oneway)
                Df  Sum Sq Mean Sq F value Pr(>F)    
education_level  2 4505007 2252503   264.3 <2e-16 ***
Residuals       75  639267    8524                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The one-way ANOVA test revealed a highly significant effect of education level on median weekly earnings. This indicates that mean earnings differ substantially across the three education groups.

Two-Way ANOVA (Analyzing Effect of Education Level & Time)

model_two_way <- aov(median_weekly_earnings ~ education_level * year, data = earnings_df)

summary(model_two_way)
                     Df  Sum Sq Mean Sq  F value   Pr(>F)    
education_level       2 4505007 2252503 4339.900  < 2e-16 ***
year                  6  578857   96476  185.881  < 2e-16 ***
education_level:year 12   30826    2569    4.949 1.54e-05 ***
Residuals            57   29584     519                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Bachelor’s degree holders consistently earn much more than those with some college/associate degrees, who in turn earn more than those with only a high school diploma.

# map <- c("Non-Degree Granting" = "Vocational", "Degree Granting" = "Traditional Bachelor's")

limited_earnings_df <- earnings_df[earnings_df$education_level != "high_school",] 

limited_earnings_df <- limited_earnings_df |> 
  mutate(degree_category = case_when(
    education_level == "ba_only" ~ "Bachelor's",education_level == "some_college_assoc" ~ "Vocational",
    TRUE ~ "NA"
  ))

tuition_df <- tuition_df |> 
  mutate(degree_category = case_when(
    degree_granting == "Degree Granting" ~ "Bachelor's",
    degree_granting == "Non-Degree Granting" ~ "Vocational",
    TRUE ~ "NA"
  ))

Join Dataframes together to assess ROI

temp_earnings_df <- limited_earnings_df |> select(c("median_weekly_earnings","year","degree_category"))
temp_tuition_df <- tuition_df |> select(c("average_tuition","year","degree_category"))

merged_df <- merge(temp_earnings_df, temp_tuition_df, by = c("degree_category","year"))

roi_df <- merged_df |> mutate( ROI = median_weekly_earnings / average_tuition )

roi_yearly <- roi_df %>%
  group_by(degree_category, year) %>%
  summarize(ROI = mean(ROI), .groups = "drop") %>%
  arrange(degree_category, year)

roi_clean <- merged_df |>
  dplyr::group_by(degree_category, year) |>
  dplyr::summarise(
    annual_earnings = mean(median_weekly_earnings) * 52,
    tuition  = mean(average_tuition),
    ROI      = annual_earnings / tuition,  # annualized ROI
    .groups = "drop"
  )


# plot <- plot_ly(
#   roi_clean,
#   x = ~year,
#   y = ~ROI,
#   color = ~degree_category,
#   colors = c("Bachelor's" = "#E64B35", "Vocational" = "#4DBBD5"),
#   type = 'scatter',
#   mode = 'lines+markers'
# ) %>%
#   layout(
#     title = "Return on Investment (ROI) Over Time",
#     xaxis = list(title = "Year"),
#     yaxis = list(title = "ROI (Median Weekly Earnings / Tuition)"),
#     legend = list(title = list(text = "<b>Degree Type</b>"))
#   )

# htmlwidgets::saveWidget(plot, "roi_plot.html")

# roi_yearly
library(plotly)

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
plot <- plot_ly(roi_clean, x = ~year) %>%
  add_trace(
    y = ~ROI,
    color = ~degree_category,
    colors = c("Bachelor's" = "#E64B35", "Vocational" = "#4DBBD5"),
    type = "scatter",
    mode = "lines+markers",
    name = ~paste(degree_category, "ROI"),
    yaxis = "y"
  ) %>%

  add_trace(
    y = ~annual_earnings,
    x = ~year,
    split = ~degree_category,
    type = "bar",
    name = ~paste(degree_category, "Earnings"),
    opacity = 0.4,
    yaxis = "y2",
    showlegend = TRUE
  ) %>%

  add_trace(
    y = ~tuition,
    x = ~year,
    split = ~degree_category,
    type = "bar",
    name = ~paste(degree_category, "Tuition"),
    opacity = 0.4,
    yaxis = "y2",
    showlegend = TRUE
  ) %>%
  layout(
    title = "ROI with Earnings and Tuition Over Time",
    barmode = "group",
    xaxis = list(title = "Year"),
    yaxis = list(
      title = "ROI (Earnings / Tuition)",
      rangemode = "tozero"
    ),
    yaxis2 = list(
      title = "Dollars (Earnings & Tuition)",
      overlaying = "y",
      side = "right",
      rangemode = "tozero"
    ),
    legend = list(title = list(text = "<b>Series</b>"))
  )

htmlwidgets::saveWidget(plot, "roi_plot.html")

plot